#################################################################################################
##### This program defines all the functions used for 2-step DR-Selection MLE and inference #####
##### Model refers to "Distribution Regression with Selection"
##### Authors: Chernozhukov, V., I. Fernandez-Val and S. Luo
##### Last Update: 6/17/2018



# require(maxLik);
# require(sampleSelection);
# require(parallel);
# require(abind);
# require(mvtnorm);
# require(MASS);
#####################################################
# Functions only related to conditional probability #
#####################################################
# Function used for column vector product of two matrices #
rowprod <- function(i, A, B){ 
  tcrossprod(A[i,],B[i,])
};

# CDF of bivariate normal distribution #
bi.norm <- function(upper_rho){ 
  rho <- tail(upper_rho,1);
  cor_matrix <- matrix(c(1,rho,rho,1), 2);
  return(pmvnorm(upper = head(upper_rho,-1), lower=c(-Inf,-Inf), sigma=cor_matrix)[1]); # or algorithm=TVPACK;
};

# PDF of bivariate normal distribution #
d.bi.norm <- function(upper_rho){ 
  dmvnorm(head(upper_rho,-1),sigma=matrix(c(1,tail(upper_rho,1),tail(upper_rho,1),1), 2));
};

# Define the conditional probability of outcome variable #
prob <- function(beta, rho, Xwc, Zpihat){
  xb <- Xwc %*% beta;
  
  prob.vect <- apply(cbind(xb,Zpihat,rho),1,bi.norm);
  
  return(prob.vect);
};

# First derivative of conditional probability #
dprob <- function(beta,rho,Xwc,Zpihat,W){
  xb <- Xwc %*% beta;
  
  dprob.beta <- c(pnorm((Zpihat-rho*xb)/sqrt(1-rho^2))*dnorm(xb))*Xwc; # N-by-K matrix;
  
  dprob.delta <- c(1-rho^2)*apply(cbind(xb,Zpihat,rho), 1, d.bi.norm)*W; # N-by-n.delta matrix;
  
  colnames(dprob.delta) <- paste("delta.",colnames(W),sep="");
  # for 2-step MLE, the denominator can be removed
  
  return(cbind(dprob.beta,dprob.delta));
};


# Second derivative of conditional probability #
ddprob <- function(beta, rho, Xwc, Zpihat, W){
  xb    <- Xwc %*% beta;
  n.obs <- NROW(Xwc);
  
  # Create the 3d matrix of N*(k+1)*(k+1) where each layer is Xi*Xi' #
  XiXiT <- array(t(apply(Xwc,1,tcrossprod)),c(n.obs,NCOL(Xwc),NCOL(Xwc)));
  
  d.2 <- apply(cbind(xb,Zpihat,rho), 1, d.bi.norm);
  
  ddprob.beta <- c((-rho/sqrt(1-rho^2)*dnorm((Zpihat-rho*xb)/sqrt(1-rho^2))-
                       as.vector(xb)*pnorm((Zpihat-rho*xb)/sqrt(1-rho^2)))*dnorm(xb))*XiXiT; # N*n.par*n.par;
  
  ddprob.cross <- -c(as.vector(xb-rho*Zpihat)* d.2)*Xwc;

  ddprob.cross.delta.l <- lapply(1:n.obs, rowprod, A=W, B=ddprob.cross);
  ddprob.cross.delta   <- aperm(abind(ddprob.cross.delta.l,along=3),c(3,1,2)); # N*n.delta*n.par;
  rm(ddprob.cross.delta.l);
  
  ddprob.cross.delta.t.l <- lapply(1:n.obs, rowprod, A=ddprob.cross, B=W);
  ddprob.cross.delta.t   <- aperm(abind(ddprob.cross.delta.t.l,along=3),c(3,1,2)); # N*n.par*n.delta;
  rm(ddprob.cross.delta.t.l);
  
  ddprob.athrho <- c(((xb-rho*Zpihat)*(Zpihat-rho*xb)-rho*(1-rho^2))*d.2)*W;
  ddprob.athrho.delta.l <- lapply(1:n.obs, rowprod, A=ddprob.athrho, B=W);
  ddprob.athrho.delta   <- aperm(abind(ddprob.athrho.delta.l,along=3),c(3,1,2)); # N*n.delta*n.delta;
  rm(ddprob.athrho.delta.l);
  
  dd <- abind(abind(ddprob.beta,ddprob.cross.delta.t ,along=3),
              abind(ddprob.cross.delta,ddprob.athrho.delta,along=3) ,along=2); 
  return(dd);
};


####################################
# Functions related to Threshold y #
####################################
# Log-Likelihood function #
LL <- function(parameter,y,pi,data.local,outcome.formula,selection.formula,rho.formula){
  Y <- data.local[,all.vars(outcome.formula)[1]];
  D <- data.local[,all.vars(selection.formula)[1]];
  
  current.na.action <- options('na.action');
  options(na.action='na.pass');
  n.beta <- NCOL(model.matrix(outcome.formula, data.local));
  n.delta <- NCOL(model.matrix(rho.formula, data.local));
  beta <- parameter[1:n.beta];
  delta <- parameter[(n.beta+1):(n.beta+n.delta)];
  xb <- model.matrix(outcome.formula, data.local) %*% beta;
  Zpihat <- model.matrix(selection.formula, data.local) %*% pi;
  rho <- tanh(model.matrix(rho.formula, data.local) %*% delta); # different for different observations;;
  options(current.na.action);
  
  p.2 <- apply(cbind(xb,Zpihat,rho),1,bi.norm);
  
  ll <- sum(D*(1*(Y<=y)*log(p.2) + 1*(Y>y)*log(pnorm(Zpihat)-p.2)),na.rm=TRUE);
  return(ll);
};


# Gradient of LL function #
Grad <- function(parameter,y,pi,data.local,outcome.formula,selection.formula,rho.formula){
  Y <- data.local[,all.vars(outcome.formula)[1]];
  D <- data.local[,all.vars(selection.formula)[1]];
  
  current.na.action <- options('na.action');
  options(na.action='na.pass');
  n.beta <- NCOL(model.matrix(outcome.formula, data.local));
  n.delta <- NCOL(model.matrix(rho.formula, data.local));
  beta <- parameter[1:n.beta];
  delta <- parameter[(n.beta+1):(n.beta+n.delta)];
  
  xb <- model.matrix(outcome.formula, data.local) %*% beta;
  Zpihat <- model.matrix(selection.formula, data.local) %*% pi;
  rho <- tanh(model.matrix(rho.formula, data.local) %*% delta); # different for different observations;;
  
  p.2 <- apply(cbind(xb,Zpihat,rho),1,bi.norm);
  Grad <- colSums(as.vector(D*(1*(Y<=y)/p.2 - 1*(Y>y)/(pnorm(Zpihat)-p.2)))*
                    dprob(beta,rho,Xwc=model.matrix(outcome.formula, data.local),Zpihat,
                          W=model.matrix(rho.formula, data.local)),na.rm=T);
  
  options(current.na.action);
  
  return(Grad);  
};


# Hessian of LL function #
Hess <- function(parameter,y,pi,data.local,outcome.formula,selection.formula,rho.formula){
  Y <- data.local[,all.vars(outcome.formula)[1]];
  D <- data.local[,all.vars(selection.formula)[1]];
  
  current.na.action <- options('na.action');
  options(na.action='na.pass');
  n.beta <- NCOL(model.matrix(outcome.formula, data.local));
  n.delta <- NCOL(model.matrix(rho.formula, data.local));
  beta <- parameter[1:n.beta];
  delta <- parameter[(n.beta+1):(n.beta+n.delta)];
  
  xb <- model.matrix(outcome.formula, data.local) %*% beta;
  Zpihat <- model.matrix(selection.formula, data.local) %*% pi;
  rho <- tanh(model.matrix(rho.formula, data.local) %*% delta); # different for different observations;;
  
  p.2 <- apply(cbind(xb,Zpihat,rho),1,bi.norm);
  dprobdprob <- array(t(apply(dprob(beta,rho,Xwc=model.matrix(outcome.formula, data.local),Zpihat,
                                    W=model.matrix(rho.formula, data.local)),1,tcrossprod)),
                      c(NROW(data.local),n.beta+n.delta,n.beta+n.delta)); # N*(K+1)*(K+1) matrix
  
  Hess <- colSums(as.vector(D*(-1*(Y<=y)/(p.2^2) - 1*(Y>y)/((pnorm(Zpihat)-p.2)^2)))*dprobdprob +
                    as.vector(D*(1*(Y<=y)/p.2 - 1*(Y>y)/(pnorm(Zpihat)-p.2)))*
                    ddprob(beta,rho,Xwc=model.matrix(outcome.formula, data.local),Zpihat,
                           W=model.matrix(rho.formula, data.local)),
                  na.rm=T);
  
  options(current.na.action);
  
  return(Hess);
};

# Expected Hessian of LL function #
Hess_E <- function(parameter,y,pi,data.local,outcome.formula,selection.formula,rho.formula){ 
  current.na.action <- options('na.action');
  options(na.action='na.pass');
  n.beta <- NCOL(model.matrix(outcome.formula, data.local));
  n.delta <- NCOL(model.matrix(rho.formula, data.local));
  beta <- parameter[1:n.beta];
  delta <- parameter[(n.beta+1):(n.beta+n.delta)];
  
  xb <- model.matrix(outcome.formula, data.local) %*% beta;
  Zpihat <- model.matrix(selection.formula, data.local) %*% pi;
  rho <- tanh(model.matrix(rho.formula, data.local) %*% delta); # different for different observations;;
  
  p.2 <- apply(cbind(xb,Zpihat,rho),1,bi.norm);
  dprobdprob <- array(t(apply(dprob(beta,rho,Xwc=model.matrix(outcome.formula, data.local),Zpihat,
                                    W=model.matrix(rho.formula, data.local)),1,tcrossprod)),
                      c(NROW(data.local),n.beta+n.delta,n.beta+n.delta));
  
  Hess.all <- -as.vector(1/p.2 + 1/(pnorm(Zpihat)-p.2))*dprobdprob;
  Hess.all[is.infinite(Hess.all)] <- 0;
  Hess <- colSums(Hess.all, na.rm=T);
  
  options(current.na.action);
  
  return(Hess);
};



#################################################################################
## Main functions used for 2-Step MLE for bivariate probit with selection model #
#################################################################################

# First stage estimation -- return estimated selection coefficient #
selection.bprob.f <- function(data.local, selection.formula){
  
  ########################
  # Selection Estimation #
  ########################
  
  # Probit D on Z #
  selection.probit <- glm(selection.formula, family=binomial(link="probit"), data=data.local);
  # summary(selection.probit); # for test only
  
  # Predict the propensity score in linear form #
  pihat <- coef(selection.probit);
  
  pise <- rep(0,length(pihat));
  pise[!summary(selection.probit)$aliased] <- summary(selection.probit)$coef[,2]; # =stdEr(selection.probit)
  # estimates.pi[2*g,summary(selection.probit)$aliased] <- NA; 
  # The above line could be uncommented, otherwise the output s.e. for NA coef would just be 0.
  
  return(rbind(pihat,pise));
};



# Second stage estimation -- return the estimates of outcome coefficients and rho #
selection.bprob.s <- function(y,ys,pi,data.local,outcome.formula,selection.formula,rho.formula,initial.val){
  
  ###################
  # Data Processing #
  ###################
  # Generate X with constant(local and temporary) #
  current.na.action <- options('na.action');
  options(na.action='na.pass');
  Xwc <- model.matrix(outcome.formula, data.local);
  
  options(current.na.action);
  
  X.name <- colnames(Xwc);
  
  # Obtain the number of parameters in Xwc and in rho.formula #
  n.par <- NCOL(Xwc);
  n.delta <- NCOL(model.matrix(rho.formula,data.local));
  
  rho.name <- paste("delta.",colnames(model.matrix(rho.formula,data.local)),sep="");
  
  
  # Generate objects to store the final results and the temporary resulst between iterations #
  e.beta <- rep(0, (2*(n.par+n.delta)));
  name.list <- c(X.name, rho.name);
  names(e.beta) <- c(name.list, paste("se.",name.list,sep=""));
  
  
  if(length(initial.val) > 0){
    beta.initial <- initial.val[which(ys==y),1:n.par];
    delta.initial <- initial.val[which(ys==y),(n.par+1):(n.par+n.delta)];
    
    names(beta.initial) <- X.name;
    names(delta.initial) <- rho.name;
    
    ov.index <- which(beta.initial == NA);
  }else{
  ##########################
  # Initial Value for Beta #
  ##########################
  outcome.formula.probit <- update(outcome.formula, ycurrent ~.);
  data.local <- within(data.local, {ycurrent <- 1*(data.local[,all.vars(outcome.formula)[1]]<=y)});
  # Probit 1(Y<=y) on X to have initial value for beta #
  outcome.probit <- glm(outcome.formula.probit, family=binomial(link="probit"), data=data.local, 
                        na.action=na.omit); 
  # summary(outcome.probit); # for test only
  beta.initial <- coef(outcome.probit);
  
  ov.index <- summary(outcome.probit)$aliased;
  
  beta.initial[ov.index] <- 0;
  
  #########################
  # Initial Value for Rho #
  #########################
  # Use Heckman's 2-step method to generate initial value for rho #
  # Reference https://cran.r-project.org/web/packages/sampleSelection/vignettes/selection.pdf #
  # It shows that the "rho" estimated in Heckman model has the opposite sign with the rho in our model #
  if(any(ov.index)){
    heckman.formula <- update(outcome.formula, drop.terms(terms(outcome.formula),which(ov.index)-1));
  }else{
    heckman.formula <- outcome.formula;
  }
  Heckman.2step <- selection(selection=selection.formula, outcome=heckman.formula, method="2-step",data=data.local); # use y instead of 1*(Y<=y)
  rho.initial <- c(-coef(Heckman.2step)["rho"],rep(0, n.delta-1));
  names(rho.initial) <- rho.name;
  delta.initial <- atanh(rho.initial);
  Heckman.coef <- rep(0,n.par);
  Heckman.coef[!ov.index] <- coef(Heckman.2step, part="outcome")[1:n.par];
  
  if(any(abs(Grad(c(beta.initial,delta.initial),y,pi,data.local,
              outcome.formula,selection.formula,rho.formula))==Inf)){
    beta.initial <- rep(0,n.par);
    delta.initial <- 0;
    };
  }; # End of if-else for initial values;
  
  ##############
  # Estimation #
  ##############
  
  MLE <- maxLik(LL,Grad,Hess_E,start=c(beta.initial,delta.initial),fixed=names(beta.initial)[ov.index] ,
                 y=y,pi=pi,data.local=data.local,
                 outcome.formula=outcome.formula,selection.formula=selection.formula,rho.formula=rho.formula);
  
  
  e.beta[1:(n.par+n.delta)] <- coef(MLE);
  e.beta[(n.par+n.delta+1):(2*(n.par+n.delta))] <- stdEr(MLE);
  
  return(e.beta);
};


# Main function to run the 2-step MLE #
Estimation <- function(data.local, ys, outcome.formula, selection.formula, rho.formula, initial.val){
  
  ##########################
  # First Stage Estimation #
  ##########################
  estimates.pi <- selection.bprob.f(data.local, selection.formula);
  
  data.local$pscore <- pnorm(model.matrix(selection.formula, data.local)%*%estimates.pi[1,]);
  
  ###########################
  # Second Stage Estimation #
  ###########################
  estimates.beta.l <- mclapply(as.list(ys),selection.bprob.s, mc.cores=n.cores, ys=ys, pi=estimates.pi[1,], data.local=data.local, 
                               outcome.formula=outcome.formula, selection.formula=selection.formula, rho.formula=rho.formula, 
                               initial.val=initial.val);
  estimates.beta <- do.call(rbind,estimates.beta.l);
  
  elist <- list(pi = estimates.pi, beta = estimates.beta);
  
  return(elist);
};


# Estimate the functional for entire population #
Fy <- function(beta, X.counter){
  beta[is.na(beta)] <- 0;
  return(sum(pnorm(X.counter%*%beta),na.rm=T)/NROW(X.counter));
};

# Estimate the functional for selected population #
Fy.cond <- function(i, pi, beta, athrho, sample.counter, outcome.formula, selection.formula, rho.formula){  
  beta[is.na(beta)]<-0;
  pi[is.na(pi)]<-0;
  
  betai <- beta[i,];
  if(is.null(dim(athrho))){
    if(length(athrho)>1){athrhoi <- athrho[i];}
    else{athrhoi <- athrho;}
  }else{athrhoi <- athrho[i,];};
  
  rho <- tanh(model.matrix(rho.formula,sample.counter) %*% athrhoi);
  
  current.na.action <- options('na.action');
  options(na.action='na.pass');
  xb     <- model.matrix(outcome.formula, sample.counter) %*% betai;
  Zpihat <- model.matrix(selection.formula, sample.counter) %*% pi;
  
  options(current.na.action);
  
  bipnorm <- apply(cbind(xb,Zpihat,rho),1,bi.norm);
  
  return(sum(bipnorm, na.rm=T)/sum(pnorm(Zpihat), na.rm=T));
};

#########################
## Influence Functions ##
#########################
# Estimate the influence function values of selection coefficients #
phi.pi <- function(pi, data.local, selection.formula, counter.id, population){
  # Generate indicators for population selection #
  if(length(population)==1){
    pop.ind <- data.local[,counter.id]==population;
  }else{
    pop.ind <- is.element(data.local[,counter.id], population);
  }
  n.obs <- NROW(data.local);
  
  # w is the adjustment weight for the population selection #
  w <- n.obs*1*pop.ind/sum(1*pop.ind,na.rm=T);
  
  D <- data.local[, all.vars(selection.formula)[1]];
  Z.name <- colnames(model.matrix(selection.formula, data.local));
  
  Zpi <- model.matrix(selection.formula, data.local) %*% pi;
  
  # Expected Hessian for first stage:
  H <- -colSums(w*c(as.vector(1/pnorm(Zpi)+1/pnorm(-Zpi))*(dnorm(Zpi))^2)*
                  array(t(apply(model.matrix(selection.formula, data.local), 1, tcrossprod)),
                        c(n.obs,length(Z.name),length(Z.name))), na.rm=T)/n.obs;
  
  if(any(is.infinite(H))){
    invH <- matrix(0,nrow=length(Z.name),ncol=length(Z.name));
  }else{
    invH <- ginv(-H); # (K+1)-by-(P+1) matrix;
  };
  
  phi <- c(as.vector(w*(D/pnorm(Zpi)-(1-D)/pnorm(-Zpi)))*dnorm(Zpi))*model.matrix(selection.formula, data.local) %*% 
    invH; # N-by-(P+1) matrix;
  colnames(phi) <- Z.name;
  
  return(phi);
};


# Estimate the influence function of outcome coefficients #
phi.theta <- function(y,parameter.ys, ys.ind, pi, data.local, 
                      outcome.formula, selection.formula, rho.formula, counter.id, population){
  # Generate indicators for population selection #
  if(length(population)==1){
    pop.ind <- data.local[,counter.id]==population;
  }else{
    pop.ind <- is.element(data.local[,counter.id], population);
  }
  n.obs <- NROW(data.local);
  
  # w is the adjustment weight for the population selection #
  w <- n.obs*1*pop.ind/sum(1*pop.ind,na.rm=T);
  
  Y <- data.local[, all.vars(outcome.formula)[1]];
  D <- data.local[, all.vars(selection.formula)[1]];
  
  current.na.action <- options('na.action');
  options(na.action='na.pass');
  n.beta <- NCOL(model.matrix(outcome.formula, data.local));
  n.delta <- NCOL(model.matrix(rho.formula, data.local));
  parameter <- parameter.ys[ys.ind==y,];
  beta <- parameter[1:n.beta];
  delta <- parameter[(n.beta+1):(n.beta+n.delta)];
  X.name <- colnames(model.matrix(outcome.formula, data.local));
  rho.name <- paste("delta.",colnames(model.matrix(rho.formula, data.local)),sep="");
  
  xb <- model.matrix(outcome.formula, data.local) %*% beta;
  Zpihat <- model.matrix(selection.formula, data.local) %*% pi;
  rho <- tanh(model.matrix(rho.formula, data.local) %*% delta); # different for different observations;;
  
  p.2 <- apply(cbind(xb,Zpihat,rho),1,bi.norm);
  d.2 <- dprob(beta,rho,model.matrix(outcome.formula, data.local),Zpihat,model.matrix(rho.formula, data.local));

  # Score and Hessian functions #
  Grad_i <- as.vector(w*D*(1*(Y<=y)/p.2 - 1*(Y>y)/(pnorm(Zpihat)-p.2)))*d.2; # N-by-(K+1) matrix
  Grad_i[is.na(Grad_i)|is.infinite(Grad_i)] <- 0;
  
  H <- Hess_E(parameter,y,pi,data.local[pop.ind,],outcome.formula,selection.formula,rho.formula)/sum(1*pop.ind,na.rm=T);
  
  if(any(is.infinite(H))){
    invH <- matrix(0,nrow=NCOL(Xwc)+1,ncol=NCOL(Xwc)+1);
  }else{
    invH <- ginv(-H); # (K+1)-by-(K+1) matrix
    # There is no weight w in Hess or Hess_E function, so the denominator here is the number of the specific population
  };
  
  dprobZiT <- aperm(abind(lapply(as.list(1:n.obs),rowprod, A=d.2, B=model.matrix(selection.formula, data.local)),
                          along=3),c(3,1,2)); # N*(K+1)*(P+1) matrix
  
  # Expected R matrix:
  R.all <- as.vector(w*(-1/p.2*c(pnorm((xb-rho*Zpihat)/sqrt(1-rho^2)))-
                          1/(pnorm(Zpihat)-p.2)*c(pnorm((-xb+rho*Zpihat)/sqrt(1-rho^2))))*dnorm(Zpihat))*dprobZiT;
  R.all[is.infinite(R.all)] <- 0;
  R <- colSums(R.all, na.rm=T)/n.obs;
  rm(R.all);
  
  phi <- (Grad_i + phi.pi(pi, data.local, selection.formula, counter.id, population)%*%t(R)) %*% invH ; # N-by-(K+1) matrix
  colnames(phi) <- c(X.name, rho.name);
  
  options(current.na.action);
  
  return(phi);
};


# Estimate the influence function of distribution fuctions #
phi.F <- function(y, parameter.ys, ys.ind, F_hat.ys, phi.para.ys, data.local, 
                  outcome.formula, selection.formula, rho.formula, counter.id, population){
  # Generate indicators for population selection #
  pop.ind <- data.local[,counter.id]==population;
  n.obs <- NROW(data.local);
  
  # w is the adjustment weight for the population selection #
  w <- n.obs*1*pop.ind/sum(1*pop.ind);
  
  current.na.action <- options('na.action');
  options(na.action='na.pass');
  Xwc <- model.matrix(outcome.formula, data.local);
  
  options(current.na.action);
  
  n.delta <- NCOL(model.matrix(rho.formula, data.local));
  
  
  parameter <- parameter.ys[ys.ind==y,];
  F_hat <- F_hat.ys[ys.ind==y];
  
  beta <- parameter[1:NCOL(Xwc)];
  xb <- Xwc %*% beta;
  
  # This is already the influence function for theta #
  phi.para <- phi.para.ys[[which(ys.ind==y)]]; # extract the N*(n.par+n.delta) matrix from a list object
  
  J <- c(colSums(c(as.vector(w)*dnorm(xb))*Xwc, na.rm=T)/n.obs, rep(0,n.delta));
  
  phi <- w*pnorm(xb) - F_hat + phi.para%*%J; # N-by-1 vector
  
  
  return(phi);
};

# Estimate the influence function of conditional distribution functions #
phi.F.cond <- function(y, parameter.ys, ys.ind, pi, F_hat.ys, phi.para.ys.beta, phi.pi.beta, 
                       phi.para.ys.rho, phi.pi.rho, phi.pi.pi, data.local, 
                       outcome.formula, selection.formula, rho.formula, counter.id, population.v){
  # populatio.v = (population, beta, rho, pi);
  population <- population.v[1];
  # Generate indicators for population selection #
  pop.ind <- data.local[,counter.id]==population;
  n.obs <- NROW(data.local);
  
  # w is the adjustment weight for the population selection #
  w <- n.obs*pop.ind/sum(pop.ind);
  
  current.na.action <- options('na.action');
  options(na.action='na.pass');
  Xwc <- model.matrix(outcome.formula, data.local);
  Zwc <- model.matrix(selection.formula, data.local);
  Wwc <- model.matrix(rho.formula, data.local);
  
  options(current.na.action);
  
  parameter <- parameter.ys[ys.ind==y,,];
  F_hat <- F_hat.ys[ys.ind==y];
  
  beta <- parameter[1:NCOL(Xwc),population.v[2]];
  delta <- parameter[(NCOL(Xwc)+1):(NCOL(Xwc)+NCOL(Wwc)),population.v[3]];
  
  rho <- tanh(Wwc %*% delta); # different for different observations; 
  
  
  xb <- Xwc%*%beta;
  Zpihat <- Zwc%*%pi[,population.v[4]];
  

  bipnorm <- apply(cbind(xb,Zpihat,rho),1,bi.norm);
  J <- colSums(w*dprob(beta,rho,Xwc,Zpihat,Wwc), na.rm=T)/sum(w*pnorm(Zpihat),na.rm=T);
  Q <- colSums(w*c(pnorm((xb-rho*Zpihat)/sqrt(1-rho^2))*dnorm(Zpihat))*Zwc,
                    na.rm=T)/sum(w*pnorm(Zpihat),na.rm=T) - 
     colSums(c(w*dnorm(Zpihat))*Zwc,na.rm=T)*sum(w*bipnorm,na.rm=T)/(sum(w*pnorm(Zpihat),na.rm=T)^2);
  
  
  # This is already the influence function for theta #
  phi.para.beta <- phi.para.ys.beta[[which(ys.ind==y)]]; # extract the N*(K+1) matrix from a list object
  phi.para.rho  <- phi.para.ys.rho[[which(ys.ind==y)]];
  
  
  phi <- w*bipnorm/pnorm(Zpihat) - F_hat + phi.para.beta%*%J + phi.pi.beta%*%Q +
          1*(population.v[2]!=population.v[3])*(phi.para.rho%*%J + phi.pi.rho%*%Q) +
          1*(population.v[2]!=population.v[4] & population.v[3]!=population.v[4])*(phi.pi.pi%*%Q); # N-by-1 vector
  
  
  return(phi);
};


# Estimate the influence function of conditional distribution functions #
phi.P <- function(pi, P, data.local, outcome.formula, selection.formula, rho.formula, counter.id, population){
  # Generate indicators for population selection #
  pop.ind <- data.local[,counter.id]==population;
  n.obs <- NROW(data.local);
  
  # w is the adjustment weight for the population selection #
  w <- n.obs*1*pop.ind/sum(1*pop.ind);
  
  current.na.action <- options('na.action');
  options(na.action='na.pass');
  Wwc <- model.matrix(selection.formula, data.local);
  
  options(current.na.action);
  
  zp <- Wwc %*% pi;
  J <- colSums(c(as.vector(w)*dnorm(zp))*Wwc, na.rm=T)/n.obs;
  
  phi <- w*pnorm(zp) - P + phi.pi(pi, data.local, selection.formula, counter.id, population)%*%J; # N-by-1 vector
  
  return(phi);
};


# Bootstrap function using influence function to get critical value of maximal t-stat #
boot.func <- function(b.boot, phi, variance, var.boot){
  # phi is a N*P*n.thres matrix, containing influence functions for P estimates.
  n.obs <- NROW(phi);
  
  # Generate weights #
  set.seed(b.boot);
  w.bt <- rnorm(n.obs);
  w.b <- w.bt - mean(w.bt);
  rm(w.bt);
    
  delta <- colMeans(phi*w.b); # P*n.thres or P*1
    
  if(var.boot == 1){variance <- colMeans((phi*w.b)^2)/n.obs;};
  delta[variance==0] <- 0;
    
  t.stat <- abs(delta)/sqrt(variance);
  
  if(length(dim(phi))==2){
    result <- rbind(delta,t.stat);
  }else{
    result <- t(cbind(delta, t.stat));
  };
  
  # Return a (n.thres+n.thres)*P or 2*P of deltas and t.stat #
  return(result); 
};

# Bootstrap function for percentage change decomposition of distribution functions #
boot.func.pct <- function(b.boot, F.tl, F.tr, F.bl, F.br, phi.tl, phi.tr, phi.bl, phi.br, ys, taus){
  # phi is a N*P*n.thres matrix, containing influence functions for P estimates.
  n.obs <- NROW(phi.tl);
  
  # Generate weights #
  set.seed(b.boot);
  w.bt <- rnorm(n.obs);
  w.b <- w.bt - mean(w.bt);
  rm(w.bt);
  
  # Bootstrapped DF's #
  F.tl.b <- F.tl + colMeans(phi.tl*w.b); 
  F.tr.b <- F.tr + colMeans(phi.tr*w.b);
  F.bl.b <- F.bl + colMeans(phi.bl*w.b);
  F.br.b <- F.br + colMeans(phi.br*w.b);
  
  QF.tl.b   <- approxfun(F.tl.b, y=ys, rule=2);
  QF.tr.b   <- approxfun(F.tr.b, y=ys, rule=2);
  QF.bl.b   <- approxfun(F.bl.b, y=ys, rule=2);
  QF.br.b   <- approxfun(F.br.b, y=ys, rule=2);
  pct.b <- function(x){(QF.tl.b(x) - QF.tr.b(x))/(QF.bl.b(x) - QF.br.b(x))*100};
  
  # Return bootstrapped percentage #
  return(pct.b(taus)); # n.thres*1
};


boot.func.pct.DF <- function(b.boot, F.tl, F.tr, F.bl, F.br, phi.tl, phi.tr, phi.bl, phi.br){
  # phi is a N*P*n.thres matrix, containing influence functions for P estimates.
  n.obs <- NROW(phi.tl);
  
  # Generate weights #
  set.seed(b.boot);
  w.bt <- rnorm(n.obs);
  w.b <- w.bt - mean(w.bt);
  rm(w.bt);
  
  # Bootstrapped DF's #
  F.tl.b <- F.tl + colMeans(phi.tl*w.b); 
  F.tr.b <- F.tr + colMeans(phi.tr*w.b);
  F.bl.b <- F.bl + colMeans(phi.bl*w.b);
  F.br.b <- F.br + colMeans(phi.br*w.b);
  
  pct.b <- (F.tl.b - F.tr.b)/(F.bl.b - F.br.b)*100;
  
  # Return bootstrapped percentage #
  return(pct.b); # n.thres*1
};


boot.func.QF <- function(b.boot, F.hat, phi.F, ys, taus){
  # phi is a N*P*n.thres matrix, containing influence functions for P estimates.
  n.obs <- NROW(phi.F);
  
  # Generate weights #
  set.seed(b.boot);
  w.bt <- rnorm(n.obs);
  w.b <- w.bt - mean(w.bt);
  rm(w.bt);
  
  # Bootstrapped DF's #
  if(length(dim(phi.F))==1){
    F.b <- F.hat + mean(phi.F*w.b);
    QF.b   <- approxfun(F.b, y=ys, rule=2)(taus);
  }else{
    F.b <- F.hat + colMeans(phi.F*w.b);
    QF.b <- sapply(1:NCOL(F.b), function(i){approxfun(F.b[,i], y=ys, rule=2)(taus)})
  };
  
  # Return bootstrapped percentage #
  return(QF.b); # n.thres*1
};

